/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifdef __aarch64__

#include "arm_gemm.hpp"
#include "../../utils.hpp"

#include <cassert>
#include <limits>

namespace arm_gemm {

void a64_gemv_fp32_mla_32 (
    const float *A_ptr, const float *B_ptr, float *output_ptr,
    size_t N, size_t K,
    const float *bias, Activation act, bool
)
{
    // Argument block handed to the assembly kernel below.
    //
    // The assembly reads the clamp bounds via `args_ptr` plus the `offset_min` /
    // `offset_max` operands. The operand bindings are outside this view
    // (presumably offsets into this struct -- confirm), so member names, types
    // and declaration order are deliberately left untouched.
    struct KernelArgs {
        // Upper clamp bound; stays at +infinity unless the activation switch
        // below replaces it (BoundedReLU).
        float maxval{std::numeric_limits<float>::infinity()};
        // Lower clamp bound; stays at -infinity unless the activation switch
        // below sets it to zero (ReLU / BoundedReLU).
        float minval{-std::numeric_limits<float>::infinity()};
        // Copy of the B operand pointer; assigned right after this declaration.
        const float *B_ptr{nullptr};
        // NOTE(review): not referenced in the visible part of the kernel --
        // confirm whether it is still needed.
        size_t output_offset{0};
        // NOTE(review): not referenced in the visible part of the kernel --
        // confirm whether it is still needed.
        unsigned int input_initial_col{0};
    } ka;

    unsigned long flags=0;
    ka.B_ptr = B_ptr;
    switch(act.type) {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            ka.maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            ka.minval = 0;
            flags |= 0x2;
            break;
    }
    __asm__ __volatile__(
      "add x22, %x[N], #0x3\n"
      "mov x21, %x[bias]\n"
      "lsr x22, x22, #0x2\n"
      "1:"  // Column loop
      "cmp x22, #0x8\n"
      "bge 85f\n"
      "cmp x22, #0x6\n"
      "bgt 73f\n"
      "beq 61f\n"
      "cmp x22, #0x4\n"
      "bgt 49f\n"
      "beq 37f\n"
      "cmp x22, #0x2\n"
      "bgt 25f\n"
      "beq 13f\n"
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 2f\n"
      "ldr q24, [x21, #0x0]\n"
      "add x21, x21, #0x10\n"
      "b 3f\n"
      "2:"  // Width 1: no bias
      "movi v24.16b, #0x0\n"
      "3:"  // Width 1: setup done
      "cmp x20, #0x4\n"
      "blt 6f\n"
      "cmp x20, #0x8\n"
      "blt 5f\n"
      "4:"  // Width 1: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q2, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v2.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q3, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v3.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q4, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v4.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "sub x20, x20, #0x4\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "cmp x20, #0x8\n"
      "bge 4b\n"
      "5:"  // Width 1: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q5, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v5.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q6, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v6.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q7, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v7.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q8, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v8.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "6:"  // Width 1: Multiply loop: Main loop skip
      "cbz x20, 8f\n"
      "7:"  // Width 1: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q9, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v9.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "sub x20, x20, #0x1\n"
      "cbnz x20, 7b\n"
      "8:"  // Width 1: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 9f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "9:"  // Width 1: No activation
      "cmp %x[N], #0x4\n"
      "blt 10f\n"
      "str q24, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 12f\n"
      "10:"  // Width 1: Partial writeback
      "tbz %x[N], #1, 11f\n"
      "str d24, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 12f\n"
      "st1 { v24.s }[2], [%x[output_ptr]]\n"
      "b 12f\n"
      "11:"  // Width 1: Partial direct writeback: partial_1_0
      "str s24, [%x[output_ptr], #0x0]\n"
      "12:"  // Width 1: Writeback done
      "b 97f\n"
      "13:"  // Width 2
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 14f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "add x21, x21, #0x20\n"
      "b 15f\n"
      "14:"  // Width 2: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "15:"  // Width 2: setup done
      "cmp x20, #0x4\n"
      "blt 18f\n"
      "cmp x20, #0x8\n"
      "blt 17f\n"
      "16:"  // Width 2: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q3, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v3.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q4, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v4.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q5, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v5.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q6, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v6.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q7, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v7.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q8, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v8.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "sub x20, x20, #0x4\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "cmp x20, #0x8\n"
      "bge 16b\n"
      "17:"  // Width 2: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q9, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v9.4s, v0.s[0]\n"
      "ldr q10, [%x[B_ptr], #0x10]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v25.4s, v10.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q11, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v11.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q12, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v12.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q13, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v13.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q14, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v14.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q15, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v15.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q16, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v16.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "18:"  // Width 2: Multiply loop: Main loop skip
      "cbz x20, 20f\n"
      "19:"  // Width 2: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q17, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v17.4s, v0.s[0]\n"
      "ldr q18, [%x[B_ptr], #0x10]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v25.4s, v18.4s, v0.s[0]\n"
      "sub x20, x20, #0x1\n"
      "cbnz x20, 19b\n"
      "20:"  // Width 2: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 21f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "21:"  // Width 2: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "cmp %x[N], #0x8\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "blt 22f\n"
      "str q25, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 24f\n"
      "22:"  // Width 2: Partial writeback
      "tbz %x[N], #1, 23f\n"
      "str d25, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 24f\n"
      "st1 { v25.s }[2], [%x[output_ptr]]\n"
      "b 24f\n"
      "23:"  // Width 2: Partial direct writeback: partial_1_4
      "tbz %x[N], #0, 24f\n"
      "str s25, [%x[output_ptr], #0x0]\n"
      "24:"  // Width 2: Writeback done
      "b 97f\n"
      "25:"  // Width 3
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 26f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "ldr q26, [x21, #0x20]\n"
      "add x21, x21, #0x30\n"
      "b 27f\n"
      "26:"  // Width 3: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "27:"  // Width 3: setup done
      "cmp x20, #0x4\n"
      "blt 30f\n"
      "cmp x20, #0x8\n"
      "blt 29f\n"
      "28:"  // Width 3: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "ldr q3, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v26.4s, v3.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v4.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q5, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v5.4s, v0.s[1]\n"
      "ldr q6, [%x[B_ptr], #0x20]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v6.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q7, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v7.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q8, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v8.4s, v0.s[2]\n"
      "ldr q9, [%x[B_ptr], #0x20]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v9.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q10, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v10.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q11, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v11.4s, v0.s[3]\n"
      "ldr q12, [%x[B_ptr], #0x20]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v12.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "sub x20, x20, #0x4\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "cmp x20, #0x8\n"
      "bge 28b\n"
      "29:"  // Width 3: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q13, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v13.4s, v0.s[0]\n"
      "ldr q14, [%x[B_ptr], #0x10]\n"
      "ldr q15, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v14.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v26.4s, v15.4s, v0.s[0]\n"
      "ldr q16, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v16.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q17, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v17.4s, v0.s[1]\n"
      "ldr q18, [%x[B_ptr], #0x20]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v18.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q19, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v19.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q20, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v20.4s, v0.s[2]\n"
      "ldr q21, [%x[B_ptr], #0x20]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v21.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q22, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v22.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q23, [%x[B_ptr], #0x10]\n"
      "fmla v25.4s, v23.4s, v0.s[3]\n"
      "ldr q1, [%x[B_ptr], #0x20]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v1.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "30:"  // Width 3: Multiply loop: Main loop skip
      "cbz x20, 32f\n"
      "31:"  // Width 3: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q2, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v2.4s, v0.s[0]\n"
      "ldr q3, [%x[B_ptr], #0x10]\n"
      "ldr q4, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v3.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v4.4s, v0.s[0]\n"
      "sub x20, x20, #0x1\n"
      "cbnz x20, 31b\n"
      "32:"  // Width 3: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 33f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "fmax v26.4s, v26.4s, v17.4s\n"
      "33:"  // Width 3: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "str q25, [%x[output_ptr], #0x10]\n"
      "cmp %x[N], #0xc\n"
      "add %x[output_ptr], %x[output_ptr], #0x20\n"
      "blt 34f\n"
      "str q26, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 36f\n"
      "34:"  // Width 3: Partial writeback
      "tbz %x[N], #1, 35f\n"
      "str d26, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 36f\n"
      "st1 { v26.s }[2], [%x[output_ptr]]\n"
      "b 36f\n"
      "35:"  // Width 3: Partial direct writeback: partial_1_8
      "tbz %x[N], #0, 36f\n"
      "str s26, [%x[output_ptr], #0x0]\n"
      "36:"  // Width 3: Writeback done
      "b 97f\n"
      "37:"  // Width 4
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 38f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "add x21, x21, #0x40\n"
      "b 39f\n"
      "38:"  // Width 4: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "39:"  // Width 4: setup done
      "cmp x20, #0x4\n"
      "blt 42f\n"
      "cmp x20, #0x8\n"
      "blt 41f\n"
      "40:"  // Width 4: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "ldr q3, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v3.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v4.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q5, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v5.4s, v0.s[1]\n"
      "ldr q6, [%x[B_ptr], #0x10]\n"
      "ldr q7, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v6.4s, v0.s[1]\n"
      "ldr q8, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v7.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v8.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q9, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v9.4s, v0.s[2]\n"
      "ldr q10, [%x[B_ptr], #0x10]\n"
      "ldr q11, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v10.4s, v0.s[2]\n"
      "ldr q12, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v11.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v12.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q13, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v13.4s, v0.s[3]\n"
      "ldr q14, [%x[B_ptr], #0x10]\n"
      "ldr q15, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v14.4s, v0.s[3]\n"
      "ldr q16, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v15.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v16.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "sub x20, x20, #0x4\n"
      "cmp x20, #0x8\n"
      "bge 40b\n"
      "41:"  // Width 4: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q17, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v17.4s, v0.s[0]\n"
      "ldr q18, [%x[B_ptr], #0x10]\n"
      "ldr q19, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v18.4s, v0.s[0]\n"
      "ldr q20, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v19.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v20.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q21, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v21.4s, v0.s[1]\n"
      "ldr q22, [%x[B_ptr], #0x10]\n"
      "ldr q23, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v22.4s, v0.s[1]\n"
      "ldr q1, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v23.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v1.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q2, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v2.4s, v0.s[2]\n"
      "ldr q3, [%x[B_ptr], #0x10]\n"
      "ldr q4, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v3.4s, v0.s[2]\n"
      "ldr q5, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v4.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v5.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q6, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v6.4s, v0.s[3]\n"
      "ldr q7, [%x[B_ptr], #0x10]\n"
      "ldr q8, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v7.4s, v0.s[3]\n"
      "ldr q9, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v8.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v9.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add x19, x19, #0x10\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "42:"  // Width 4: Multiply loop: Main loop skip
      "cbz x20, 44f\n"
      "43:"  // Width 4: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q10, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v10.4s, v0.s[0]\n"
      "ldr q11, [%x[B_ptr], #0x10]\n"
      "ldr q12, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v11.4s, v0.s[0]\n"
      "ldr q13, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v12.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "sub x20, x20, #0x1\n"
      "fmla v27.4s, v13.4s, v0.s[0]\n"
      "cbnz x20, 43b\n"
      "44:"  // Width 4: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 45f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "fmax v26.4s, v26.4s, v17.4s\n"
      "fmax v27.4s, v27.4s, v17.4s\n"
      "45:"  // Width 4: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "str q25, [%x[output_ptr], #0x10]\n"
      "str q26, [%x[output_ptr], #0x20]\n"
      "cmp %x[N], #0x10\n"
      "add %x[output_ptr], %x[output_ptr], #0x30\n"
      "blt 46f\n"
      "str q27, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 48f\n"
      "46:"  // Width 4: Partial writeback
      "tbz %x[N], #1, 47f\n"
      "str d27, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 48f\n"
      "st1 { v27.s }[2], [%x[output_ptr]]\n"
      "b 48f\n"
      "47:"  // Width 4: Partial direct writeback: partial_1_12
      "tbz %x[N], #0, 48f\n"
      "str s27, [%x[output_ptr], #0x0]\n"
      "48:"  // Width 4: Writeback done
      "b 97f\n"
      "49:"  // Width 5
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 50f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "ldr q28, [x21, #0x40]\n"
      "add x21, x21, #0x50\n"
      "b 51f\n"
      "50:"  // Width 5: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "51:"  // Width 5: setup done
      "cmp x20, #0x4\n"
      "blt 54f\n"
      "cmp x20, #0x8\n"
      "blt 53f\n"
      "52:"  // Width 5: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "ldr q3, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v3.4s, v0.s[0]\n"
      "ldr q5, [%x[B_ptr], #0x40]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v27.4s, v4.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q6, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v5.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q7, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v6.4s, v0.s[1]\n"
      "ldr q8, [%x[B_ptr], #0x20]\n"
      "ldr q9, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v7.4s, v0.s[1]\n"
      "ldr q10, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v8.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v9.4s, v0.s[1]\n"
      "ldr q11, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v10.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q12, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v11.4s, v0.s[2]\n"
      "ldr q13, [%x[B_ptr], #0x20]\n"
      "ldr q14, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v12.4s, v0.s[2]\n"
      "ldr q15, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v13.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v14.4s, v0.s[2]\n"
      "ldr q16, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v15.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q17, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v16.4s, v0.s[3]\n"
      "ldr q18, [%x[B_ptr], #0x20]\n"
      "ldr q19, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v17.4s, v0.s[3]\n"
      "ldr q20, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v18.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v19.4s, v0.s[3]\n"
      "add x19, x19, #0x10\n"
      "fmla v28.4s, v20.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "sub x20, x20, #0x4\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "cmp x20, #0x8\n"
      "bge 52b\n"
      "53:"  // Width 5: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q21, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v21.4s, v0.s[0]\n"
      "ldr q22, [%x[B_ptr], #0x10]\n"
      "ldr q23, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v22.4s, v0.s[0]\n"
      "ldr q1, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v23.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x40]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v27.4s, v1.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "ldr q3, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v2.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q4, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v3.4s, v0.s[1]\n"
      "ldr q5, [%x[B_ptr], #0x20]\n"
      "ldr q6, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v4.4s, v0.s[1]\n"
      "ldr q7, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v5.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v6.4s, v0.s[1]\n"
      "ldr q8, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v7.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q9, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v8.4s, v0.s[2]\n"
      "ldr q10, [%x[B_ptr], #0x20]\n"
      "ldr q11, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v9.4s, v0.s[2]\n"
      "ldr q12, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v10.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v11.4s, v0.s[2]\n"
      "ldr q13, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v12.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q14, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v13.4s, v0.s[3]\n"
      "ldr q15, [%x[B_ptr], #0x20]\n"
      "ldr q16, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v14.4s, v0.s[3]\n"
      "ldr q17, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v15.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v16.4s, v0.s[3]\n"
      "add x19, x19, #0x10\n"
      "fmla v28.4s, v17.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "54:"  // Width 5: Multiply loop: Main loop skip
      "cbz x20, 56f\n"
      "55:"  // Width 5: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q18, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v18.4s, v0.s[0]\n"
      "ldr q19, [%x[B_ptr], #0x10]\n"
      "ldr q20, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v19.4s, v0.s[0]\n"
      "ldr q21, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v20.4s, v0.s[0]\n"
      "ldr q22, [%x[B_ptr], #0x40]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v27.4s, v21.4s, v0.s[0]\n"
      "sub x20, x20, #0x1\n"
      "fmla v28.4s, v22.4s, v0.s[0]\n"
      "cbnz x20, 55b\n"
      "56:"  // Width 5: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 57f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "fmax v26.4s, v26.4s, v17.4s\n"
      "fmax v27.4s, v27.4s, v17.4s\n"
      "fmin v28.4s, v28.4s, v16.4s\n"
      "fmax v28.4s, v28.4s, v17.4s\n"
      "57:"  // Width 5: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "str q25, [%x[output_ptr], #0x10]\n"
      "str q26, [%x[output_ptr], #0x20]\n"
      "str q27, [%x[output_ptr], #0x30]\n"
      "cmp %x[N], #0x14\n"
      "add %x[output_ptr], %x[output_ptr], #0x40\n"
      "blt 58f\n"
      "str q28, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 60f\n"
      "58:"  // Width 5: Partial writeback
      "tbz %x[N], #1, 59f\n"
      "str d28, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 60f\n"
      "st1 { v28.s }[2], [%x[output_ptr]]\n"
      "b 60f\n"
      "59:"  // Width 5: Partial direct writeback: partial_1_16
      "tbz %x[N], #0, 60f\n"
      "str s28, [%x[output_ptr], #0x0]\n"
      "60:"  // Width 5: Writeback done
      "b 97f\n"
      "61:"  // Width 6
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 62f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "ldr q28, [x21, #0x40]\n"
      "ldr q29, [x21, #0x50]\n"
      "add x21, x21, #0x60\n"
      "b 63f\n"
      "62:"  // Width 6: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "63:"  // Width 6: setup done
      "cmp x20, #0x4\n"
      "blt 66f\n"
      "cmp x20, #0x8\n"
      "blt 65f\n"
      "64:"  // Width 6: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "ldr q3, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v3.4s, v0.s[0]\n"
      "ldr q5, [%x[B_ptr], #0x40]\n"
      "ldr q6, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v4.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v5.4s, v0.s[0]\n"
      "ldr q7, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v6.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q8, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v7.4s, v0.s[1]\n"
      "ldr q9, [%x[B_ptr], #0x20]\n"
      "ldr q10, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v8.4s, v0.s[1]\n"
      "ldr q11, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v9.4s, v0.s[1]\n"
      "ldr q12, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v10.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v11.4s, v0.s[1]\n"
      "ldr q13, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v12.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q14, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v13.4s, v0.s[2]\n"
      "ldr q15, [%x[B_ptr], #0x20]\n"
      "ldr q16, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v14.4s, v0.s[2]\n"
      "ldr q17, [%x[B_ptr], #0x40]\n"
      "ldr q18, [%x[B_ptr], #0x50]\n"
      "fmla v26.4s, v15.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v16.4s, v0.s[2]\n"
      "ldr q19, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v17.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q20, [%x[B_ptr], #0x10]\n"
      "fmla v29.4s, v18.4s, v0.s[2]\n"
      "ldr q21, [%x[B_ptr], #0x20]\n"
      "ldr q22, [%x[B_ptr], #0x30]\n"
      "fmla v24.4s, v19.4s, v0.s[3]\n"
      "ldr q23, [%x[B_ptr], #0x40]\n"
      "ldr q1, [%x[B_ptr], #0x50]\n"
      "fmla v25.4s, v20.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v26.4s, v21.4s, v0.s[3]\n"
      "add x19, x19, #0x10\n"
      "fmla v27.4s, v22.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "sub x20, x20, #0x4\n"
      "fmla v28.4s, v23.4s, v0.s[3]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "cmp x20, #0x8\n"
      "fmla v29.4s, v1.4s, v0.s[3]\n"
      "bge 64b\n"
      "65:"  // Width 6: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q2, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v2.4s, v0.s[0]\n"
      "ldr q3, [%x[B_ptr], #0x10]\n"
      "ldr q4, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v3.4s, v0.s[0]\n"
      "ldr q5, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v4.4s, v0.s[0]\n"
      "ldr q6, [%x[B_ptr], #0x40]\n"
      "ldr q7, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v5.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v6.4s, v0.s[0]\n"
      "ldr q8, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v7.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q9, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v8.4s, v0.s[1]\n"
      "ldr q10, [%x[B_ptr], #0x20]\n"
      "ldr q11, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v9.4s, v0.s[1]\n"
      "ldr q12, [%x[B_ptr], #0x40]\n"
      "fmla v26.4s, v10.4s, v0.s[1]\n"
      "ldr q13, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v11.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v12.4s, v0.s[1]\n"
      "ldr q14, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v13.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q15, [%x[B_ptr], #0x10]\n"
      "fmla v24.4s, v14.4s, v0.s[2]\n"
      "ldr q16, [%x[B_ptr], #0x20]\n"
      "ldr q17, [%x[B_ptr], #0x30]\n"
      "fmla v25.4s, v15.4s, v0.s[2]\n"
      "ldr q18, [%x[B_ptr], #0x40]\n"
      "ldr q19, [%x[B_ptr], #0x50]\n"
      "fmla v26.4s, v16.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v17.4s, v0.s[2]\n"
      "ldr q20, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v18.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q21, [%x[B_ptr], #0x10]\n"
      "fmla v29.4s, v19.4s, v0.s[2]\n"
      "ldr q22, [%x[B_ptr], #0x20]\n"
      "ldr q23, [%x[B_ptr], #0x30]\n"
      "fmla v24.4s, v20.4s, v0.s[3]\n"
      "ldr q1, [%x[B_ptr], #0x40]\n"
      "ldr q2, [%x[B_ptr], #0x50]\n"
      "fmla v25.4s, v21.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v26.4s, v22.4s, v0.s[3]\n"
      "add x19, x19, #0x10\n"
      "fmla v27.4s, v23.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "fmla v28.4s, v1.4s, v0.s[3]\n"
      "fmla v29.4s, v2.4s, v0.s[3]\n"
      "66:"  // Width 6: Multiply loop: Main loop skip
      "cbz x20, 68f\n"
      "67:"  // Width 6: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q3, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v3.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x10]\n"
      "ldr q5, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v4.4s, v0.s[0]\n"
      "ldr q6, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v5.4s, v0.s[0]\n"
      "ldr q7, [%x[B_ptr], #0x40]\n"
      "ldr q8, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v6.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "sub x20, x20, #0x1\n"
      "fmla v28.4s, v7.4s, v0.s[0]\n"
      "fmla v29.4s, v8.4s, v0.s[0]\n"
      "cbnz x20, 67b\n"
      "68:"  // Width 6: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 69f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "fmax v26.4s, v26.4s, v17.4s\n"
      "fmax v27.4s, v27.4s, v17.4s\n"
      "fmin v28.4s, v28.4s, v16.4s\n"
      "fmin v29.4s, v29.4s, v16.4s\n"
      "fmax v28.4s, v28.4s, v17.4s\n"
      "fmax v29.4s, v29.4s, v17.4s\n"
      "69:"  // Width 6: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "str q25, [%x[output_ptr], #0x10]\n"
      "str q26, [%x[output_ptr], #0x20]\n"
      "str q27, [%x[output_ptr], #0x30]\n"
      "str q28, [%x[output_ptr], #0x40]\n"
      "cmp %x[N], #0x18\n"
      "add %x[output_ptr], %x[output_ptr], #0x50\n"
      "blt 70f\n"
      "str q29, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 72f\n"
      "70:"  // Width 6: Partial writeback
      "tbz %x[N], #1, 71f\n"
      "str d29, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 72f\n"
      "st1 { v29.s }[2], [%x[output_ptr]]\n"
      "b 72f\n"
      "71:"  // Width 6: Partial direct writeback: partial_1_20
      "tbz %x[N], #0, 72f\n"
      "str s29, [%x[output_ptr], #0x0]\n"
      "72:"  // Width 6: Writeback done
      "b 97f\n"
      "73:"  // Width 7
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 74f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "ldr q28, [x21, #0x40]\n"
      "ldr q29, [x21, #0x50]\n"
      "ldr q30, [x21, #0x60]\n"
      "add x21, x21, #0x70\n"
      "b 75f\n"
      "74:"  // Width 7: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "75:"  // Width 7: setup done
      "cmp x20, #0x4\n"
      "blt 78f\n"
      "cmp x20, #0x8\n"
      "blt 77f\n"
      "76:"  // Width 7: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "ldr q3, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v3.4s, v0.s[0]\n"
      "ldr q5, [%x[B_ptr], #0x40]\n"
      "ldr q6, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v4.4s, v0.s[0]\n"
      "ldr q7, [%x[B_ptr], #0x60]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v28.4s, v5.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v29.4s, v6.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q8, [%x[B_ptr], #0x0]\n"
      "fmla v30.4s, v7.4s, v0.s[0]\n"
      "ldr q9, [%x[B_ptr], #0x10]\n"
      "ldr q10, [%x[B_ptr], #0x20]\n"
      "fmla v24.4s, v8.4s, v0.s[1]\n"
      "ldr q11, [%x[B_ptr], #0x30]\n"
      "ldr q12, [%x[B_ptr], #0x40]\n"
      "fmla v25.4s, v9.4s, v0.s[1]\n"
      "ldr q13, [%x[B_ptr], #0x50]\n"
      "fmla v26.4s, v10.4s, v0.s[1]\n"
      "ldr q14, [%x[B_ptr], #0x60]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v27.4s, v11.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v12.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q15, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v13.4s, v0.s[1]\n"
      "ldr q16, [%x[B_ptr], #0x10]\n"
      "ldr q17, [%x[B_ptr], #0x20]\n"
      "fmla v30.4s, v14.4s, v0.s[1]\n"
      "ldr q18, [%x[B_ptr], #0x30]\n"
      "fmla v24.4s, v15.4s, v0.s[2]\n"
      "ldr q19, [%x[B_ptr], #0x40]\n"
      "ldr q20, [%x[B_ptr], #0x50]\n"
      "fmla v25.4s, v16.4s, v0.s[2]\n"
      "ldr q21, [%x[B_ptr], #0x60]\n"
      "fmla v26.4s, v17.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v18.4s, v0.s[2]\n"
      "ldr q22, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v19.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q23, [%x[B_ptr], #0x10]\n"
      "fmla v29.4s, v20.4s, v0.s[2]\n"
      "ldr q1, [%x[B_ptr], #0x20]\n"
      "ldr q2, [%x[B_ptr], #0x30]\n"
      "fmla v30.4s, v21.4s, v0.s[2]\n"
      "ldr q3, [%x[B_ptr], #0x40]\n"
      "fmla v24.4s, v22.4s, v0.s[3]\n"
      "ldr q4, [%x[B_ptr], #0x50]\n"
      "ldr q5, [%x[B_ptr], #0x60]\n"
      "fmla v25.4s, v23.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v1.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v2.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add x19, x19, #0x10\n"
      "fmla v28.4s, v3.4s, v0.s[3]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "sub x20, x20, #0x4\n"
      "fmla v29.4s, v4.4s, v0.s[3]\n"
      "cmp x20, #0x8\n"
      "fmla v30.4s, v5.4s, v0.s[3]\n"
      "bge 76b\n"
      "77:"  // Width 7: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q6, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v6.4s, v0.s[0]\n"
      "ldr q7, [%x[B_ptr], #0x10]\n"
      "ldr q8, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v7.4s, v0.s[0]\n"
      "ldr q9, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v8.4s, v0.s[0]\n"
      "ldr q10, [%x[B_ptr], #0x40]\n"
      "ldr q11, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v9.4s, v0.s[0]\n"
      "ldr q12, [%x[B_ptr], #0x60]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v28.4s, v10.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v29.4s, v11.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q13, [%x[B_ptr], #0x0]\n"
      "fmla v30.4s, v12.4s, v0.s[0]\n"
      "ldr q14, [%x[B_ptr], #0x10]\n"
      "ldr q15, [%x[B_ptr], #0x20]\n"
      "fmla v24.4s, v13.4s, v0.s[1]\n"
      "ldr q16, [%x[B_ptr], #0x30]\n"
      "ldr q17, [%x[B_ptr], #0x40]\n"
      "fmla v25.4s, v14.4s, v0.s[1]\n"
      "ldr q18, [%x[B_ptr], #0x50]\n"
      "fmla v26.4s, v15.4s, v0.s[1]\n"
      "ldr q19, [%x[B_ptr], #0x60]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v27.4s, v16.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v17.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q20, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v18.4s, v0.s[1]\n"
      "ldr q21, [%x[B_ptr], #0x10]\n"
      "ldr q22, [%x[B_ptr], #0x20]\n"
      "fmla v30.4s, v19.4s, v0.s[1]\n"
      "ldr q23, [%x[B_ptr], #0x30]\n"
      "fmla v24.4s, v20.4s, v0.s[2]\n"
      "ldr q1, [%x[B_ptr], #0x40]\n"
      "ldr q2, [%x[B_ptr], #0x50]\n"
      "fmla v25.4s, v21.4s, v0.s[2]\n"
      "ldr q3, [%x[B_ptr], #0x60]\n"
      "fmla v26.4s, v22.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v23.4s, v0.s[2]\n"
      "ldr q4, [%x[B_ptr], #0x0]\n"
      "fmla v28.4s, v1.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q5, [%x[B_ptr], #0x10]\n"
      "fmla v29.4s, v2.4s, v0.s[2]\n"
      "ldr q6, [%x[B_ptr], #0x20]\n"
      "ldr q7, [%x[B_ptr], #0x30]\n"
      "fmla v30.4s, v3.4s, v0.s[2]\n"
      "ldr q8, [%x[B_ptr], #0x40]\n"
      "fmla v24.4s, v4.4s, v0.s[3]\n"
      "ldr q9, [%x[B_ptr], #0x50]\n"
      "ldr q10, [%x[B_ptr], #0x60]\n"
      "fmla v25.4s, v5.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v26.4s, v6.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v27.4s, v7.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "add x19, x19, #0x10\n"
      "fmla v28.4s, v8.4s, v0.s[3]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "fmla v29.4s, v9.4s, v0.s[3]\n"
      "fmla v30.4s, v10.4s, v0.s[3]\n"
      "78:"  // Width 7: Multiply loop: Main loop skip
      "cbz x20, 80f\n"
      "79:"  // Width 7: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q11, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v11.4s, v0.s[0]\n"
      "ldr q12, [%x[B_ptr], #0x10]\n"
      "ldr q13, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v12.4s, v0.s[0]\n"
      "ldr q14, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v13.4s, v0.s[0]\n"
      "ldr q15, [%x[B_ptr], #0x40]\n"
      "ldr q16, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v14.4s, v0.s[0]\n"
      "ldr q17, [%x[B_ptr], #0x60]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "fmla v28.4s, v15.4s, v0.s[0]\n"
      "fmla v29.4s, v16.4s, v0.s[0]\n"
      "sub x20, x20, #0x1\n"
      "fmla v30.4s, v17.4s, v0.s[0]\n"
      "cbnz x20, 79b\n"
      "80:"  // Width 7: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 81f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "fmax v26.4s, v26.4s, v17.4s\n"
      "fmax v27.4s, v27.4s, v17.4s\n"
      "fmin v28.4s, v28.4s, v16.4s\n"
      "fmin v29.4s, v29.4s, v16.4s\n"
      "fmin v30.4s, v30.4s, v16.4s\n"
      "fmax v28.4s, v28.4s, v17.4s\n"
      "fmax v29.4s, v29.4s, v17.4s\n"
      "fmax v30.4s, v30.4s, v17.4s\n"
      "81:"  // Width 7: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "str q25, [%x[output_ptr], #0x10]\n"
      "str q26, [%x[output_ptr], #0x20]\n"
      "str q27, [%x[output_ptr], #0x30]\n"
      "str q28, [%x[output_ptr], #0x40]\n"
      "str q29, [%x[output_ptr], #0x50]\n"
      "cmp %x[N], #0x1c\n"
      "add %x[output_ptr], %x[output_ptr], #0x60\n"
      "blt 82f\n"
      "str q30, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 84f\n"
      "82:"  // Width 7: Partial writeback
      "tbz %x[N], #1, 83f\n"
      "str d30, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 84f\n"
      "st1 { v30.s }[2], [%x[output_ptr]]\n"
      "b 84f\n"
      "83:"  // Width 7: Partial direct writeback: partial_1_24
      "tbz %x[N], #0, 84f\n"
      "str s30, [%x[output_ptr], #0x0]\n"
      "84:"  // Width 7: Writeback done
      "b 97f\n"
      "85:"  // Width 8
      "mov x20, %x[K]\n"
      "mov x19, %x[A_ptr]\n"
      "cbz x21, 86f\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "ldr q28, [x21, #0x40]\n"
      "ldr q29, [x21, #0x50]\n"
      "ldr q30, [x21, #0x60]\n"
      "ldr q31, [x21, #0x70]\n"
      "add x21, x21, #0x80\n"
      "b 87f\n"
      "86:"  // Width 8: no bias
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "87:"  // Width 8: setup done
      "cmp x20, #0x4\n"
      "blt 90f\n"
      "cmp x20, #0x8\n"
      "blt 89f\n"
      "88:"  // Width 8: Multiply loop: Main loop head
      "ldr q0, [x19, #0x0]\n"
      "ldr q1, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v1.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x10]\n"
      "ldr q3, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v2.4s, v0.s[0]\n"
      "ldr q4, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v3.4s, v0.s[0]\n"
      "ldr q5, [%x[B_ptr], #0x40]\n"
      "ldr q6, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v4.4s, v0.s[0]\n"
      "ldr q7, [%x[B_ptr], #0x60]\n"
      "ldr q8, [%x[B_ptr], #0x70]\n"
      "fmla v28.4s, v5.4s, v0.s[0]\n"
      "fmla v29.4s, v6.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v30.4s, v7.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q9, [%x[B_ptr], #0x0]\n"
      "fmla v31.4s, v8.4s, v0.s[0]\n"
      "ldr q10, [%x[B_ptr], #0x10]\n"
      "ldr q11, [%x[B_ptr], #0x20]\n"
      "fmla v24.4s, v9.4s, v0.s[1]\n"
      "ldr q12, [%x[B_ptr], #0x30]\n"
      "ldr q13, [%x[B_ptr], #0x40]\n"
      "fmla v25.4s, v10.4s, v0.s[1]\n"
      "fmla v26.4s, v11.4s, v0.s[1]\n"
      "ldr q14, [%x[B_ptr], #0x50]\n"
      "ldr q15, [%x[B_ptr], #0x60]\n"
      "fmla v27.4s, v12.4s, v0.s[1]\n"
      "ldr q16, [%x[B_ptr], #0x70]\n"
      "fmla v28.4s, v13.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v29.4s, v14.4s, v0.s[1]\n"
      "ldr q17, [%x[B_ptr], #0x0]\n"
      "fmla v30.4s, v15.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q18, [%x[B_ptr], #0x10]\n"
      "fmla v31.4s, v16.4s, v0.s[1]\n"
      "ldr q19, [%x[B_ptr], #0x20]\n"
      "ldr q20, [%x[B_ptr], #0x30]\n"
      "fmla v24.4s, v17.4s, v0.s[2]\n"
      "ldr q21, [%x[B_ptr], #0x40]\n"
      "ldr q22, [%x[B_ptr], #0x50]\n"
      "fmla v25.4s, v18.4s, v0.s[2]\n"
      "ldr q23, [%x[B_ptr], #0x60]\n"
      "fmla v26.4s, v19.4s, v0.s[2]\n"
      "ldr q1, [%x[B_ptr], #0x70]\n"
      "fmla v27.4s, v20.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v21.4s, v0.s[2]\n"
      "ldr q2, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v22.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q3, [%x[B_ptr], #0x10]\n"
      "fmla v30.4s, v23.4s, v0.s[2]\n"
      "ldr q4, [%x[B_ptr], #0x20]\n"
      "ldr q5, [%x[B_ptr], #0x30]\n"
      "fmla v31.4s, v1.4s, v0.s[2]\n"
      "ldr q6, [%x[B_ptr], #0x40]\n"
      "fmla v24.4s, v2.4s, v0.s[3]\n"
      "ldr q7, [%x[B_ptr], #0x50]\n"
      "ldr q8, [%x[B_ptr], #0x60]\n"
      "fmla v25.4s, v3.4s, v0.s[3]\n"
      "ldr q9, [%x[B_ptr], #0x70]\n"
      "fmla v26.4s, v4.4s, v0.s[3]\n"
      "fmla v27.4s, v5.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v6.4s, v0.s[3]\n"
      "add x19, x19, #0x10\n"
      "fmla v29.4s, v7.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "sub x20, x20, #0x4\n"
      "fmla v30.4s, v8.4s, v0.s[3]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "cmp x20, #0x8\n"
      "fmla v31.4s, v9.4s, v0.s[3]\n"
      "bge 88b\n"
      "89:"  // Width 8: Multiply loop: Single iteration only
      "sub x20, x20, #0x4\n"
      "ldr q0, [x19, #0x0]\n"
      "ldr q10, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v10.4s, v0.s[0]\n"
      "ldr q11, [%x[B_ptr], #0x10]\n"
      "ldr q12, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v11.4s, v0.s[0]\n"
      "ldr q13, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v12.4s, v0.s[0]\n"
      "ldr q14, [%x[B_ptr], #0x40]\n"
      "ldr q15, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v13.4s, v0.s[0]\n"
      "ldr q16, [%x[B_ptr], #0x60]\n"
      "ldr q17, [%x[B_ptr], #0x70]\n"
      "fmla v28.4s, v14.4s, v0.s[0]\n"
      "fmla v29.4s, v15.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v30.4s, v16.4s, v0.s[0]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q18, [%x[B_ptr], #0x0]\n"
      "fmla v31.4s, v17.4s, v0.s[0]\n"
      "ldr q19, [%x[B_ptr], #0x10]\n"
      "ldr q20, [%x[B_ptr], #0x20]\n"
      "fmla v24.4s, v18.4s, v0.s[1]\n"
      "ldr q21, [%x[B_ptr], #0x30]\n"
      "ldr q22, [%x[B_ptr], #0x40]\n"
      "fmla v25.4s, v19.4s, v0.s[1]\n"
      "fmla v26.4s, v20.4s, v0.s[1]\n"
      "ldr q23, [%x[B_ptr], #0x50]\n"
      "ldr q1, [%x[B_ptr], #0x60]\n"
      "fmla v27.4s, v21.4s, v0.s[1]\n"
      "ldr q2, [%x[B_ptr], #0x70]\n"
      "fmla v28.4s, v22.4s, v0.s[1]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v29.4s, v23.4s, v0.s[1]\n"
      "ldr q3, [%x[B_ptr], #0x0]\n"
      "fmla v30.4s, v1.4s, v0.s[1]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q4, [%x[B_ptr], #0x10]\n"
      "fmla v31.4s, v2.4s, v0.s[1]\n"
      "ldr q5, [%x[B_ptr], #0x20]\n"
      "ldr q6, [%x[B_ptr], #0x30]\n"
      "fmla v24.4s, v3.4s, v0.s[2]\n"
      "ldr q7, [%x[B_ptr], #0x40]\n"
      "ldr q8, [%x[B_ptr], #0x50]\n"
      "fmla v25.4s, v4.4s, v0.s[2]\n"
      "ldr q9, [%x[B_ptr], #0x60]\n"
      "fmla v26.4s, v5.4s, v0.s[2]\n"
      "ldr q10, [%x[B_ptr], #0x70]\n"
      "fmla v27.4s, v6.4s, v0.s[2]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v7.4s, v0.s[2]\n"
      "ldr q11, [%x[B_ptr], #0x0]\n"
      "fmla v29.4s, v8.4s, v0.s[2]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "ldr q12, [%x[B_ptr], #0x10]\n"
      "fmla v30.4s, v9.4s, v0.s[2]\n"
      "ldr q13, [%x[B_ptr], #0x20]\n"
      "ldr q14, [%x[B_ptr], #0x30]\n"
      "fmla v31.4s, v10.4s, v0.s[2]\n"
      "ldr q15, [%x[B_ptr], #0x40]\n"
      "fmla v24.4s, v11.4s, v0.s[3]\n"
      "ldr q16, [%x[B_ptr], #0x50]\n"
      "ldr q17, [%x[B_ptr], #0x60]\n"
      "fmla v25.4s, v12.4s, v0.s[3]\n"
      "ldr q18, [%x[B_ptr], #0x70]\n"
      "fmla v26.4s, v13.4s, v0.s[3]\n"
      "fmla v27.4s, v14.4s, v0.s[3]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
      "fmla v28.4s, v15.4s, v0.s[3]\n"
      "add x19, x19, #0x10\n"
      "fmla v29.4s, v16.4s, v0.s[3]\n"
      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
      "fmla v30.4s, v17.4s, v0.s[3]\n"
      "prfm pldl1keep, [x19, #0x80]\n"
      "fmla v31.4s, v18.4s, v0.s[3]\n"
      "90:"  // Width 8: Multiply loop: Main loop skip
      "cbz x20, 92f\n"
      "91:"  // Width 8: Multiply loop: Odd block loop
      "ldr s0, [x19], #0x4\n"
      "ldr q19, [%x[B_ptr], #0x0]\n"
      "fmla v24.4s, v19.4s, v0.s[0]\n"
      "ldr q20, [%x[B_ptr], #0x10]\n"
      "ldr q21, [%x[B_ptr], #0x20]\n"
      "fmla v25.4s, v20.4s, v0.s[0]\n"
      "ldr q22, [%x[B_ptr], #0x30]\n"
      "fmla v26.4s, v21.4s, v0.s[0]\n"
      "ldr q23, [%x[B_ptr], #0x40]\n"
      "ldr q1, [%x[B_ptr], #0x50]\n"
      "fmla v27.4s, v22.4s, v0.s[0]\n"
      "ldr q2, [%x[B_ptr], #0x60]\n"
      "ldr q3, [%x[B_ptr], #0x70]\n"
      "fmla v28.4s, v23.4s, v0.s[0]\n"
      "fmla v29.4s, v1.4s, v0.s[0]\n"
      "add %x[B_ptr], %x[B_ptr], #0x80\n"
      "sub x20, x20, #0x1\n"
      "fmla v30.4s, v2.4s, v0.s[0]\n"
      "fmla v31.4s, v3.4s, v0.s[0]\n"
      "cbnz x20, 91b\n"
      "92:"  // Width 8: Multiply loop: No odd multiplies
      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
      "tbz %x[flags], #1, 93f\n"
      "add x19, %x[args_ptr], %[offset_min]\n"
      "ld1r { v17.4s }, [x19]\n"
      "add x19, %x[args_ptr], %[offset_max]\n"
      "ld1r { v16.4s }, [x19]\n"
      "fmin v24.4s, v24.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v16.4s\n"
      "fmax v24.4s, v24.4s, v17.4s\n"
      "fmax v25.4s, v25.4s, v17.4s\n"
      "fmax v26.4s, v26.4s, v17.4s\n"
      "fmax v27.4s, v27.4s, v17.4s\n"
      "fmin v28.4s, v28.4s, v16.4s\n"
      "fmin v29.4s, v29.4s, v16.4s\n"
      "fmin v30.4s, v30.4s, v16.4s\n"
      "fmax v28.4s, v28.4s, v17.4s\n"
      "fmax v29.4s, v29.4s, v17.4s\n"
      "fmax v30.4s, v30.4s, v17.4s\n"
      "fmin v31.4s, v31.4s, v16.4s\n"
      "fmax v31.4s, v31.4s, v17.4s\n"
      "93:"  // Width 8: No activation
      "str q24, [%x[output_ptr], #0x0]\n"
      "str q25, [%x[output_ptr], #0x10]\n"
      "str q26, [%x[output_ptr], #0x20]\n"
      "str q27, [%x[output_ptr], #0x30]\n"
      "str q28, [%x[output_ptr], #0x40]\n"
      "str q29, [%x[output_ptr], #0x50]\n"
      "str q30, [%x[output_ptr], #0x60]\n"
      "cmp %x[N], #0x20\n"
      "add %x[output_ptr], %x[output_ptr], #0x70\n"
      "blt 94f\n"
      "str q31, [%x[output_ptr], #0x0]\n"
      "add %x[output_ptr], %x[output_ptr], #0x10\n"
      "b 96f\n"
      "94:"  // Width 8: Partial writeback
      "tbz %x[N], #1, 95f\n"
      "str d31, [%x[output_ptr]], #0x8\n"
      "tbz %x[N], #0, 96f\n"
      "st1 { v31.s }[2], [%x[output_ptr]]\n"
      "b 96f\n"
      "95:"  // Width 8: Partial direct writeback: partial_1_28
      "tbz %x[N], #0, 96f\n"
      "str s31, [%x[output_ptr], #0x0]\n"
      "96:"  // Width 8: Writeback done
      "subs x22, x22, #0x8\n"
      "sub %x[N], %x[N], #0x20\n"
      "bgt 1b\n"
      "97:"  // Exit

      : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
      : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
    );
}

} // namespace arm_gemm

#endif
